import pandas as pd
from Kernels.src.Analysis.Clustering import *
from sklearn.metrics import normalized_mutual_info_score, adjusted_mutual_info_score
from sklearn.metrics import adjusted_rand_score
import itertools
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
import lifelines
import statsmodels.api as sm
import itertools
from scipy import stats
import plotly.graph_objects as go
from matplotlib.offsetbox import AnchoredText
import matplotlib.font_manager as fm
fontPath = "/CTGlab/home/danilo/.fonts/Inter-VariableFont_wght.ttf"
def plotSankey(df: pd.DataFrame, cols: list, names=None, linkVar=None, valueToHighlight=None):
    """
    Plot a Sankey diagram based on the given dataframe and columns.

    Values held in the `cols` columns are expected to be prefixed with their
    column name (e.g. column "Cluster_Z_5_1" holding "Cluster_Z_5_1_3"),
    because the source column is recovered by stripping the last '_' token.

    Parameters:
    - df (pd.DataFrame): The dataframe containing the data.
    - cols (list): A list of column names to use for plotting.
    - names (list): Optional header labels, one per plotted column; defaults to `cols`.
    - linkVar (str): The name of the column to use as a link variable (optional).
    - valueToHighlight: Value, or iterable of values, of the link variable to
      highlight in orange (optional).
    Returns:
    - fig: The generated plotly figure object.
    """
    # Unique node labels per column, then a global label -> node-index map.
    colsToPlot = [list(set(df[col])) for col in cols]
    toenumerate = []
    for colValues in colsToPlot:
        toenumerate = toenumerate + colValues
    mask = {v: k for k, v in enumerate(toenumerate)}
    # All (source, target) label pairs between adjacent columns.
    combinations = []
    for i in range(0, len(colsToPlot) - 1):
        combinations = combinations + list(itertools.product(colsToPlot[i], colsToPlot[i + 1]))
    # BUGFIX: the original compared each link value with `!= valueToHighlight`,
    # so passing a list (as every caller in this file does, e.g. ['High'])
    # silently never highlighted anything. Normalise to a membership set.
    if valueToHighlight is None:
        highlight = set()
    elif isinstance(valueToHighlight, (list, tuple, set)):
        highlight = set(valueToHighlight)
    else:
        highlight = {valueToHighlight}
    sources = []
    targets = []
    values = []
    colors = []
    if linkVar is not None:
        linkVarValues = df[linkVar].unique()
    for combination in combinations:
        source = mask[combination[0]]
        target = mask[combination[1]]
        # Recover the original column names by dropping the value suffix.
        column1 = '_'.join(combination[0].split('_')[:-1])
        column2 = '_'.join(combination[1].split('_')[:-1])
        if linkVar is not None:
            # One link per value of the link variable, coloured by highlight.
            for val in linkVarValues:
                flow = df.loc[(df[column1] == combination[0]) & (df[column2] == combination[1]) & (df[linkVar] == val)].shape[0]
                colors.append("orange" if val in highlight else "lightgrey")
                sources.append(source)
                targets.append(target)
                values.append(flow)
        else:
            flow = df.loc[(df[column1] == combination[0]) & (df[column2] == combination[1])].shape[0]
            colors.append('lightgrey')
            sources.append(source)
            targets.append(target)
            values.append(flow)
    fig = go.Figure(data=[go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            # Node labels show only the value suffix (the part after the last '_').
            label=[f"{x.split('_')[-1]}" for x in toenumerate]
        ),
        link=dict(
            source=sources,
            target=targets,
            value=values,
            color=colors,
            label=[f"{x.split('_')[-1]}" for x in toenumerate]
        ))])
    if names is None:
        names = cols
    # One header annotation per plotted column.
    for x_coordinate, column_name in enumerate(names):
        fig.add_annotation(
            x=x_coordinate,
            y=1.05,
            xref="x",
            yref="paper",
            text=column_name,
            showarrow=False,
            font=dict(
                size=16,
                color="black"
            ),
            align="center",
        )
    fig.update_layout(
        title='Sample Assignments in Clustering Methods' if valueToHighlight is None else f"Sample Assignments in Clustering Methods\nHighlighted {valueToHighlight}",
        plot_bgcolor='white',
        paper_bgcolor='white',
        font_size=14,
        hovermode='x',
        autosize=False,
        width=1000, height=600,
        xaxis={
            'showgrid': False,
            'zeroline': False,
            'visible': False,
        },
        yaxis={
            'showgrid': False,
            'zeroline': False,
            'visible': False,
        }
    )
    fig.show()
    return fig
def plot_KM_curves(survDf: pd.DataFrame, durationVar: str, eventVar: str, indVar: str, ax=None, groups=None, **kwargs):
    """
    Plot KM curves with respect to an independent variable, including a log-rank test.

    Parameters:
    - survDf (pd.DataFrame): dataframe
    - durationVar (str): Column name with survival time
    - eventVar (str): Column name denoting the event. Must be encoded as boolean or int (0,1)
    - indVar (str): Column name with the independent variable. If it contains more than two
      different values, the two groups to be tested with the log-rank test must be provided
      with the `groups` argument
    - ax (plt.Axes): matplotlib axes; a new figure/axes pair is created when omitted
    - groups (Iterable): Iterable containing the two values of indVar to compare
    - **kwargs: extra arguments forwarded to lifelines.KaplanMeierFitter.plot
    Returns:
    plt.Axes, or None when indVar has more than two levels and `groups` is missing/too short
    """
    levels = np.sort(survDf[indVar].dropna().unique())
    if len(levels) == 2:
        # Exactly two levels: log-rank between them directly.
        pval = lifelines.statistics.logrank_test(
            durations_A=survDf[survDf[indVar] == levels[0]][durationVar],
            durations_B=survDf[survDf[indVar] == levels[1]][durationVar],
            events_observed_A=survDf[survDf[indVar] == levels[0]][eventVar],
            events_observed_B=survDf[survDf[indVar] == levels[1]][eventVar]
        ).p_value
    else:
        try:
            iter(groups)
            pval = lifelines.statistics.logrank_test(
                durations_A=survDf[survDf[indVar] == groups[0]][durationVar],
                durations_B=survDf[survDf[indVar] == groups[1]][durationVar],
                events_observed_A=survDf[survDf[indVar] == groups[0]][eventVar],
                events_observed_B=survDf[survDf[indVar] == groups[1]][eventVar]
            ).p_value
        # IndexError added: `groups` may be iterable but hold fewer than two values.
        except (TypeError, IndexError):
            print('Groups require an iterable containing two variables to compare between')
            return None
    # BUGFIX: ax is documented as optional but was dereferenced unconditionally,
    # raising AttributeError when omitted. Create one on demand.
    if ax is None:
        _, ax = plt.subplots()
    kmPlots = [None for _ in levels]
    # Annotate the log-rank p-value in the lower-left corner.
    at = AnchoredText('$p$: {}'.format(float(np.round(pval, 7))), prop=dict(fontproperties=fm.FontProperties(fname=fontPath, size=12)), frameon=True, loc='lower left')
    at.patch.set_boxstyle("round,pad=0.,rounding_size=0.2")
    ax.add_artist(at)
    ax.set_facecolor('none')
    ax.set_ylim(0, 1)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    # One KM curve per level of the independent variable (all levels are drawn,
    # even when only two of them were log-rank-tested via `groups`).
    for idx, level in enumerate(levels):
        kmPlots[idx] = lifelines.KaplanMeierFitter()
        events = (survDf[indVar] == level)
        kmPlots[idx].fit(
            durations=survDf[durationVar][events],
            event_observed=survDf[eventVar][events],
            label=level)
        kmPlots[idx].plot(ax=ax, **kwargs, lw=2.5, color=sns.color_palette('colorblind')[idx])
    lifelines.plotting.add_at_risk_counts(*kmPlots, ax=ax, fontproperties=fm.FontProperties(fname=fontPath, size=10))
    return ax
# original input data: UMAP components + IntClust membership for all METABRIC samples
data_input = pd.read_csv("Input_data/UMAP_METABRIC_4cmp_IC10_full.csv", sep=",", index_col=0)
data_input = data_input.sample(n=100, axis=0, random_state=42)
# empty frame sharing the subsample's patient index
df_new_clust = pd.DataFrame(index=data_input.index)
# get classical kernel
# SELECT FT
n_qubits = 4
features = [f'Component_{i}' for i in range(1, n_qubits + 1)]
labels = 'IntClustMemb'
print(features)
######## SAMPLING ###################################
y_train = data_input[labels].to_numpy()
X_train = data_input[features]
['Component_1', 'Component_2', 'Component_3', 'Component_4']
# Results data sets
df_real = pd.read_csv('/CTGlab/home/valeria/Quantum-Machine-Learning-for-Expression-Data/Results/Unsupervised_umap_100_hardware/Mumbai_CU/clustering_100_opt_k_reviewed.csv')
df_sim = pd.read_csv('/CTGlab/home/valeria/Quantum-Machine-Learning-for-Expression-Data/Results/Unsupervised_100_umap/clustering_100_opt_k_reviewed.csv')
df_sim_1000 = pd.read_csv('/CTGlab/home/valeria/Quantum-Machine-Learning-for-Expression-Data/Results/Unsupervised_1000_umap/clustering_1000_opt_k_reviewed.csv')
# Tag provenance and harmonise the feature-map notation ('Z_full' -> 'Z') in one pass
for frame, origin in ((df_real, 'Real'), (df_sim, 'Simulated'), (df_sim_1000, 'Simulated')):
    frame['Type'] = [origin] * len(frame)
    frame.replace('Z_full', 'Z', inplace=True)
# 100-sample simulated and real results stacked together
df = pd.concat([df_sim, df_real], axis=0, ignore_index=True)
# Set outdir
outdir = '/CTGlab/home/valeria/Quantum-Machine-Learning-for-Expression-Data/Results/Unsupervised_umap_100_hardware/Mumbai_CU/'
Calculate Adjusted Mutual Information (AMI) and Adjusted Rand Index (ARI) between real and simulated clustering
df_clust_real = pd.read_csv('/CTGlab/home/valeria/Quantum-Machine-Learning-for-Expression-Data/Results/Unsupervised_umap_100_hardware/Mumbai_CU/clustering_100_clusters.csv', index_col=0)
df_clust_sim = pd.read_csv('/CTGlab/home/valeria/Quantum-Machine-Learning-for-Expression-Data/Results/Unsupervised_100_umap/clustering_100_clusters.csv', index_col=0)
# BUGFIX: the original cross-assigned the renamed columns (real received sim's
# column names and vice versa). That is only harmless if both files share the
# exact same column order; otherwise columns get silently mislabelled.
# Rename each frame's OWN columns instead.
df_clust_real.columns = df_clust_real.columns.str.replace("_Z_full", "_Z")
df_clust_sim.columns = df_clust_sim.columns.str.replace("_Z_full", "_Z")
# calculate nmi, ami and ari between real and simulated common columns
# save the results in a list of dicts
nmi_ari_dict_list = []
for col in df_clust_real.columns:
    # column naming convention: Cluster_<ftmap terms...>_<K>_<Bandwidth>
    clust = col.split('_')
    if clust[1] == 'Z':
        ftmap = clust[1]
    elif clust[1] == 'ZZ':
        ftmap = clust[1] + '_' + clust[2]
    else:
        ftmap = 'rbf'
    ari = adjusted_rand_score(df_clust_real[col], df_clust_sim[col])
    nmi = normalized_mutual_info_score(df_clust_real[col], df_clust_sim[col])
    ami = adjusted_mutual_info_score(df_clust_real[col], df_clust_sim[col])
    nmi_ari_dict_list.append({'ftmap': ftmap,
                              'K': clust[-2],
                              'Bandwidth': clust[-1],
                              'ARI': ari,
                              'NMI': nmi,
                              'AMI': ami})
# List of dictionaries to dataframe
df_nmi_ari = pd.DataFrame(nmi_ari_dict_list)
# Select best case according to AMI (notebook display expression)
df_nmi_ari[(df_nmi_ari.ftmap != 'rbf') & (df_nmi_ari.ARI == 1)].sort_values(by='AMI', ascending=False)
| ftmap | K | Bandwidth | ARI | NMI | AMI | |
|---|---|---|---|---|---|---|
| 100 | Z | 3 | 0.25 | 1.0 | 1.0 | 1.0 |
| 102 | Z | 5 | 0.25 | 1.0 | 1.0 | 1.0 |
| 108 | Z | 2 | 0.5 | 1.0 | 1.0 | 1.0 |
| 109 | Z | 3 | 0.5 | 1.0 | 1.0 | 1.0 |
| 117 | Z | 2 | 1 | 1.0 | 1.0 | 1.0 |
| 118 | Z | 3 | 1 | 1.0 | 1.0 | 1.0 |
| 120 | Z | 5 | 1 | 1.0 | 1.0 | 1.0 |
| 122 | Z | 7 | 1 | 1.0 | 1.0 | 1.0 |
| 126 | Z | 2 | 2 | 1.0 | 1.0 | 1.0 |
| 128 | Z | 4 | 2 | 1.0 | 1.0 | 1.0 |
# Let's check which has the best silhouette score
best_ami_idx = df_nmi_ari[(df_nmi_ari.ftmap != 'rbf') & (df_nmi_ari.ARI == 1)].sort_values(by='AMI', ascending=False).index
# Plot ARI and NMI
ft_map_to_plot = 'Z'
score_to_plot = 'AMI'
sns.set_style("whitegrid")
sns.lineplot(data=df_nmi_ari[df_nmi_ari['ftmap'].isin([ft_map_to_plot])], y=score_to_plot, x='K', hue='Bandwidth', markers=True, palette='tab10')
plt.ylim(0.00, 1.05)
# BUGFIX: the format string has two placeholders; the stray third argument
# (df.N_samples[0]) was never rendered, so drop it.
plt.title('{} score between simulated and real quantum kernel \n for {} feature map \n'.format(score_to_plot, ft_map_to_plot))
plt.savefig(outdir + '{}_ftmap_{}_score.png'.format(ft_map_to_plot, score_to_plot))
# Load original patient index from data and metabric data (metadata)
# Load clustering data
print('#### CASE 1000 samples simulated ####')
df_clust=pd.read_csv('/CTGlab/home/valeria/Quantum-Machine-Learning-for-Expression-Data/Results/Unsupervised_1000_umap/clustering_1000_clusters.csv',index_col=0)
# Harmonise notation in both column names and cell values ('Z_full' -> 'Z')
df_clust.columns = df_clust.columns.str.replace("_Z_full", "_Z")
df_clust.replace('Z_full','Z',inplace=True)
data_input = pd.read_csv("Input_data/UMAP_METABRIC_4cmp_IC10_full.csv", sep = ",",index_col=0)
# Same fixed seed as the clustering runs, so rows line up positionally
patient_index=data_input.sample(n=1000,axis=0,random_state=42).index
# set index of df_clust to the patient index
df_clust.index=patient_index
# Load metadata (clinical annotations per patient)
metabric = pd.read_csv('/CTGlab/data/brca_metabric/data_clinical_patient.txt', sep='\t', skiprows=[0,1,2,3])
# Remove patients with missing OS_STATUS and merge with data
metabricSubset = metabric.loc[~metabric['OS_STATUS'].isna()]
# Merge keyed on PATIENT_ID vs the clustering frame's index; rows without
# clustering assignments are dropped again via dropna on the cluster columns.
quantumDf = pd.merge(metabricSubset, df_clust, how='left', right_on=[df_clust.index], left_on=['PATIENT_ID']).dropna(subset=df_clust.columns)
# 1 = died of disease, 0 = anything else (treated as censored)
quantumDf['OS_STATUS_censored'] = quantumDf['VITAL_STATUS'].apply(lambda x: 1 if x == "Died of Disease" else 0)
# OS_STATUS arrives as "n:LABEL" strings; keep the numeric prefix
quantumDf['OS_STATUS'] = quantumDf['OS_STATUS'].apply(lambda x: int(x.split(':')[0]))
#### CASE 1000 samples simulated ####
print('#### CASE 100 samples real ####')
# Same 100-patient subsample used everywhere in this script (seed fixed at 42)
patient_index_100 = data_input.sample(n=100, axis=0, random_state=42).index
df_clust_real.index = patient_index_100
# Attach clinical metadata to the hardware clustering assignments
quantumDf_real_100 = pd.merge(
    metabricSubset, df_clust_real,
    how='left', right_on=[df_clust_real.index], left_on=['PATIENT_ID']
).dropna(subset=df_clust_real.columns)
# 1 = died of disease, 0 = anything else (censored)
quantumDf_real_100['OS_STATUS_censored'] = quantumDf_real_100['VITAL_STATUS'].apply(lambda status: 1 if status == "Died of Disease" else 0)
# "n:LABEL" -> n
quantumDf_real_100['OS_STATUS'] = quantumDf_real_100['OS_STATUS'].apply(lambda status: int(status.split(':')[0]))
#### CASE 100 samples real ####
print('#### CASE 100 samples sim ####')
# Simulated 100-sample clustering shares the same patient index as the real run
df_clust_sim.index = patient_index_100
quantumDf_sim_100 = pd.merge(
    metabricSubset, df_clust_sim,
    how='left', right_on=[df_clust_sim.index], left_on=['PATIENT_ID']
).dropna(subset=df_clust_sim.columns)
# 1 = died of disease, 0 = anything else (censored)
quantumDf_sim_100['OS_STATUS_censored'] = quantumDf_sim_100['VITAL_STATUS'].apply(lambda status: 1 if status == "Died of Disease" else 0)
# "n:LABEL" -> n
quantumDf_sim_100['OS_STATUS'] = quantumDf_sim_100['OS_STATUS'].apply(lambda status: int(status.split(':')[0]))
#### CASE 100 samples sim ####
"Best cases" among Most coherent clustering between real and sim (100 samples)
Select "best cases" from the ideal 1000 sample simulation
# Prepare columns: every clustering-assignment column
cluster_col = [colName for colName in quantumDf_real_100.columns if 'Cluster' in colName]
# Prepare df for sankey: prefix every value with its column name, which is the
# convention plotSankey relies on to recover the source column
quantumDf_s = quantumDf_real_100.copy()
for column in ['INTCLUST', 'CLAUDIN_SUBTYPE'] + cluster_col:
    quantumDf_s[column] = quantumDf_s[column].apply(lambda x: f"{column}_{x}")
To plot:
Note: best_ami_idx below contains the clusterings with an AMI == 1 that were selected in the clustering-coherence section
# The ones with best silhouette (real, 100 samples; notebook display expression)
df_real.iloc[best_ami_idx].sort_values(by='silhouette',ascending=False)
| Unnamed: 0 | ftmap | K | Bandwidth | s | geom_distance | concentration | silhouette | Score_cluster | CHI | DI | v_intra | v_inter | N_samples | Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 108 | 108 | Z | 2 | 0.50 | 6057.476628 | 2.091055 | 0.066699 | 0.558708 | 0.546341 | 44.248236 | 0.878541 | 0.047888 | 0.000839 | 100 | Real |
| 109 | 109 | Z | 3 | 0.50 | 6057.476628 | 2.091055 | 0.066699 | 0.322350 | 0.390373 | 47.500560 | 0.082701 | 0.030636 | 0.065569 | 100 | Real |
| 122 | 122 | Z | 7 | 1.00 | 4097.149781 | 3.207312 | 0.053731 | 0.319908 | 0.396945 | 46.765743 | 0.135121 | 0.047943 | 0.022372 | 100 | Real |
| 100 | 100 | Z | 3 | 0.25 | 10403.194977 | 3.860099 | 0.019106 | 0.319376 | 0.446294 | 32.530205 | 0.134315 | 0.006838 | 0.042391 | 100 | Real |
| 102 | 102 | Z | 5 | 0.25 | 10403.194977 | 3.860099 | 0.019106 | 0.276013 | 0.477067 | 57.298958 | 0.101108 | 0.004788 | 0.032672 | 100 | Real |
| 120 | 120 | Z | 5 | 1.00 | 4097.149781 | 3.207312 | 0.053731 | 0.263627 | 0.285558 | 42.044629 | 0.064890 | 0.062896 | 0.024372 | 100 | Real |
| 118 | 118 | Z | 3 | 1.00 | 4097.149781 | 3.207312 | 0.053731 | 0.234614 | 0.188932 | 51.586242 | 0.154200 | 0.054458 | 0.009508 | 100 | Real |
| 117 | 117 | Z | 2 | 1.00 | 4097.149781 | 3.207312 | 0.053731 | 0.210122 | 0.188603 | 44.248236 | 0.825989 | 0.050707 | 0.000704 | 100 | Real |
| 128 | 128 | Z | 4 | 2.00 | 6454.449698 | 10.512267 | 0.023323 | 0.119377 | 0.149922 | 5.631460 | 0.264826 | 0.046313 | 0.007921 | 100 | Real |
| 126 | 126 | Z | 2 | 2.00 | 6454.449698 | 10.512267 | 0.023323 | 0.081838 | 0.082667 | 0.604951 | 0.143146 | 0.033291 | 0.008383 | 100 | Real |
# plot best 3
cols_name_S = []
cols_name_S_sel = []
cols_to_plot = []
cols_to_plot_full = []
# Walk the AMI==1 candidates ranked by silhouette; keep all of them in the
# *_full lists and only those with K > 3 in the *_sel lists.
ranked_by_s = df_real.iloc[best_ami_idx].sort_values(by='silhouette', ascending=False)
for row in ranked_by_s.itertuples():
    column_label = f'Cluster_{row.ftmap}_{row.K}_{row.Bandwidth:g}'
    header_label = f'b:{row.Bandwidth},S:{row.silhouette:.2f}, DI:{row.DI:.2f}'
    cols_to_plot_full.append(column_label)
    cols_name_S.append(header_label)
    if row.K > 3:
        cols_to_plot.append(column_label)
        cols_name_S_sel.append(header_label)
First, plot also the ones with few clusters, which are the ones with a higher score
# Sankey of all best-AMI clusterings (including K<=3), highlighting 'High' CELLULARITY links
fig = plotSankey(quantumDf_s, cols=cols_to_plot_full,names=cols_name_S, valueToHighlight=['High'], linkVar='CELLULARITY')
We can see from the plot above that the stratifications with few clusters contain subclusters, so they can be discarded from further consideration even though they have higher silhouette (s) or DI scores. We then plot only those clusterings with k>3:
# Same Sankey restricted to the clusterings with K > 3
fig = plotSankey(quantumDf_s, cols=cols_to_plot,names=cols_name_S_sel, valueToHighlight=['High'], linkVar='CELLULARITY')
Let's look at the DI, following the same considerations
# The ones with best DI (same candidates, ranked by Dunn index; notebook display)
df_real.iloc[best_ami_idx].sort_values(by='DI',ascending=False)
| Unnamed: 0 | ftmap | K | Bandwidth | s | geom_distance | concentration | silhouette | Score_cluster | CHI | DI | v_intra | v_inter | N_samples | Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 108 | 108 | Z | 2 | 0.50 | 6057.476628 | 2.091055 | 0.066699 | 0.558708 | 0.546341 | 44.248236 | 0.878541 | 0.047888 | 0.000839 | 100 | Real |
| 117 | 117 | Z | 2 | 1.00 | 4097.149781 | 3.207312 | 0.053731 | 0.210122 | 0.188603 | 44.248236 | 0.825989 | 0.050707 | 0.000704 | 100 | Real |
| 128 | 128 | Z | 4 | 2.00 | 6454.449698 | 10.512267 | 0.023323 | 0.119377 | 0.149922 | 5.631460 | 0.264826 | 0.046313 | 0.007921 | 100 | Real |
| 118 | 118 | Z | 3 | 1.00 | 4097.149781 | 3.207312 | 0.053731 | 0.234614 | 0.188932 | 51.586242 | 0.154200 | 0.054458 | 0.009508 | 100 | Real |
| 126 | 126 | Z | 2 | 2.00 | 6454.449698 | 10.512267 | 0.023323 | 0.081838 | 0.082667 | 0.604951 | 0.143146 | 0.033291 | 0.008383 | 100 | Real |
| 122 | 122 | Z | 7 | 1.00 | 4097.149781 | 3.207312 | 0.053731 | 0.319908 | 0.396945 | 46.765743 | 0.135121 | 0.047943 | 0.022372 | 100 | Real |
| 100 | 100 | Z | 3 | 0.25 | 10403.194977 | 3.860099 | 0.019106 | 0.319376 | 0.446294 | 32.530205 | 0.134315 | 0.006838 | 0.042391 | 100 | Real |
| 102 | 102 | Z | 5 | 0.25 | 10403.194977 | 3.860099 | 0.019106 | 0.276013 | 0.477067 | 57.298958 | 0.101108 | 0.004788 | 0.032672 | 100 | Real |
| 109 | 109 | Z | 3 | 0.50 | 6057.476628 | 2.091055 | 0.066699 | 0.322350 | 0.390373 | 47.500560 | 0.082701 | 0.030636 | 0.065569 | 100 | Real |
| 120 | 120 | Z | 5 | 1.00 | 4097.149781 | 3.207312 | 0.053731 | 0.263627 | 0.285558 | 42.044629 | 0.064890 | 0.062896 | 0.024372 | 100 | Real |
cols_to_plot_DI = []
cols_name_DI = []
# Rank the AMI==1 candidates by Dunn index and keep only those with K > 3
for row in df_real.iloc[best_ami_idx].sort_values(by='DI', ascending=False).itertuples():
    if row.K > 3:
        cols_name_DI.append(f'b:{row.Bandwidth},S:{row.silhouette:.2f}, DI:{row.DI:.2f}')
        cols_to_plot_DI.append(f'Cluster_{row.ftmap}_{row.K}_{row.Bandwidth:g}')
# notebook display of the selected columns
cols_to_plot_DI
['Cluster_Z_4_2', 'Cluster_Z_7_1', 'Cluster_Z_5_0.25', 'Cluster_Z_5_1']
Below, the four best clusterings according to the DI score, for K>3
# Sankey of the four best clusterings by DI (K > 3)
fig=plotSankey(quantumDf_s, cols=cols_to_plot_DI, names=cols_name_DI, valueToHighlight=['High'], linkVar='CELLULARITY')
We move further to compare the 2 best cases for each index
# Top-2 by silhouette vs top-2 by DI
print(cols_to_plot[:2])
print(cols_to_plot_DI[:2])
['Cluster_Z_7_1', 'Cluster_Z_5_0.25'] ['Cluster_Z_4_2', 'Cluster_Z_7_1']
# Union of the two sets (best-by-silhouette and best-by-DI, two each)
# NOTE(review): set union does not preserve order, so the column order in the
# plot below is arbitrary between runs.
cols_to_plot_union=list(set(cols_to_plot[:2])|(set(cols_to_plot_DI[:2])))
fig=plotSankey(quantumDf_s, cols=cols_to_plot_union, valueToHighlight=['High'], linkVar='CELLULARITY')
Looking at the recurring mini-cluster in different stratifications, we checked whether this difference was inherited from the UMAP dimensionality reduction performed
# Re-draw the 100-sample subset (same fixed seed as everywhere else)
data_input_100=data_input.sample(n=100,axis=0,random_state=42)
# NOTE(review): this Series appears unused — `clust` is re-bound to a string
# label later in the script; confirm before removing.
clust=quantumDf_real_100['Cluster_Z_5_0.5']
# Sanity check: every merged patient should be in the subsample (prints nothing if so)
for i in quantumDf_real_100.PATIENT_ID:
    if i not in data_input_100.index:
        print(i)
# notebook display of the subsample
data_input_100
| Component_1 | Component_2 | Component_3 | Component_4 | IntClustMemb | |
|---|---|---|---|---|---|
| MB-4270 | 4.459003 | 1.762128 | 8.716180 | 3.800151 | 9 |
| MB-6068 | 3.232354 | 0.750432 | 9.716409 | 3.052997 | 4 |
| MB-0666 | 4.351296 | 3.491412 | 9.277145 | 2.818116 | 9 |
| MB-7010 | 5.542212 | 4.578652 | 9.395142 | 3.620670 | 7 |
| MB-7036 | 3.751446 | 0.916476 | 9.636867 | 2.495010 | 10 |
| ... | ... | ... | ... | ... | ... |
| MB-3351 | 3.863398 | 4.373843 | 9.367185 | 2.028247 | 3 |
| MB-0615 | 4.540109 | 3.731272 | 10.014156 | 2.037181 | 5 |
| MB-3466 | 4.534412 | 3.938404 | 9.328135 | 2.299947 | 2 |
| MB-0301 | 5.592884 | 3.783750 | 9.421013 | 2.585854 | 7 |
| MB-5602 | 4.244184 | 1.218765 | 9.299067 | 2.870626 | 10 |
100 rows × 5 columns
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.5)
# Selected clustering to colour by
clust='Cluster_Z_5_0.5'
# Pairplot of the four UMAP components coloured by the selected clustering
sns.pairplot(data_input_100.merge(quantumDf_real_100[[clust,'PATIENT_ID']],how='left',right_on=['PATIENT_ID'], left_on=[data_input_100.index]), hue=clust, palette='tab10', diag_kind='kde',plot_kws={'alpha':0.5})
<seaborn.axisgrid.PairGrid at 0x7f57dc48b7c0>
Are they immortal?
# KM curves for the selected clustering, truncated at 150 months;
# log-rank test between clusters 1 and 3
fig, ax = plt.subplots(figsize=(8,8))
plot_KM_curves(
    quantumDf_real_100.loc[quantumDf_real_100['OS_MONTHS'] <150 ], durationVar='OS_MONTHS',eventVar='OS_STATUS', indVar=clust, groups=[1,3], ax=ax, ci_show=False)
ax.set_xlabel('Survival, Months')
ax.set_ylabel('Overall survival probability')
Text(0, 0.5, 'Overall survival probability')
Not immortal :(
They seem to live long enough
Let's check the highest-scoring clusters for the simulated 1000-sample case:
# Rebuild the Sankey-ready frame for the 1000-sample simulated case
quantumDf_s=quantumDf.copy()
# NOTE(review): cluster_col was computed from the 100-sample frame's columns;
# this assumes the 1000-sample frame has the same cluster column names — confirm.
for column in quantumDf_s.loc[:, ['INTCLUST','CLAUDIN_SUBTYPE']+cluster_col].columns:
    quantumDf_s[column] = quantumDf_s[column].apply(lambda x: f"{column}_{x}")
# Drop duplicated rbf cases due to bandwidth (keep only Bandwidth == 1 for rbf)
df_sim_1000=df_sim_1000[(df_sim_1000['ftmap'] != 'rbf') | (df_sim_1000['Bandwidth'] == 1)]
Lets start with the silhouette
# Rank the 1000-sample simulated results by silhouette (in place)
df_sim_1000.sort_values(by='silhouette',ascending=False,inplace=True)
# Get only the 10 best with silhouette > 0.3 (notebook display)
df_sim_1000[df_sim_1000['silhouette']>0.3].iloc[:10]
| Unnamed: 0 | ftmap | K | Bandwidth | s | geom_distance | concentration | silhouette | Score_cluster | CHI | DI | v_intra | v_inter | N_samples | Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 135 | 135 | ZZ_linear | 2 | 0.125 | 127734.680535 | 9.484586 | 0.070194 | 0.628426 | 0.624934 | 154.232887 | 0.837206 | 0.056991 | 0.000647 | 1000 | Simulated |
| 28 | 28 | rbf | 3 | 1.000 | 74501.173166 | 2.500972 | 0.029959 | 0.572370 | 0.620737 | 529.898257 | 0.000929 | 0.008170 | 0.032906 | 1000 | Simulated |
| 136 | 136 | ZZ_linear | 3 | 0.125 | 127734.680535 | 9.484586 | 0.070194 | 0.551887 | 0.584336 | 408.213514 | 0.037421 | 0.025789 | 0.055347 | 1000 | Simulated |
| 27 | 27 | rbf | 2 | 1.000 | 74501.173166 | 2.500972 | 0.029959 | 0.542565 | 0.548716 | 670.054354 | 0.001045 | 0.019163 | 0.025802 | 1000 | Simulated |
| 29 | 29 | rbf | 4 | 1.000 | 74501.173166 | 2.500972 | 0.029959 | 0.419561 | 0.616219 | 547.824164 | 0.002553 | 0.005853 | 0.032249 | 1000 | Simulated |
| 32 | 32 | rbf | 7 | 1.000 | 74501.173166 | 2.500972 | 0.029959 | 0.407904 | 0.719146 | 507.595970 | 0.001485 | 0.003319 | 0.029624 | 1000 | Simulated |
| 30 | 30 | rbf | 5 | 1.000 | 74501.173166 | 2.500972 | 0.029959 | 0.404097 | 0.652419 | 525.682252 | 0.000729 | 0.005026 | 0.030578 | 1000 | Simulated |
| 33 | 33 | rbf | 8 | 1.000 | 74501.173166 | 2.500972 | 0.029959 | 0.395397 | 0.734598 | 482.822831 | 0.002395 | 0.003016 | 0.029523 | 1000 | Simulated |
| 31 | 31 | rbf | 6 | 1.000 | 74501.173166 | 2.500972 | 0.029959 | 0.392686 | 0.680109 | 481.499765 | 0.000729 | 0.004570 | 0.030022 | 1000 | Simulated |
| 152 | 152 | ZZ_linear | 10 | 0.250 | 74836.101545 | 5.339809 | 0.080288 | 0.391744 | 0.602663 | 321.743957 | 0.032121 | 0.026329 | 0.051701 | 1000 | Simulated |
Considering most of these are rbf, and thus not very useful for us, we can proceed and plot only the rbf ones to see cluster consistency between micro and macro clusters, and then select one as a reference to compare with the quantum
cols_to_plot_rbf = []
cols_name_rbf = []
# Best rbf clusterings (silhouette > 0.3), at most 10
for i in df_sim_1000[(df_sim_1000['silhouette']>0.3) & (df_sim_1000.ftmap=='rbf')].iloc[:10].itertuples():
    name = 'Cluster_'+i.ftmap+'_'+str(i.K)+'_'+f'{i.Bandwidth:g}'
    # BUGFIX: the format string has one placeholder; the stray second argument
    # (i.DI) was never rendered, so drop it.
    name_col = 'S:{:.2f}'.format(i.silhouette)
    cols_name_rbf.append(name_col)
    cols_to_plot_rbf.append(name)
fig = plotSankey(quantumDf_s, cols=cols_to_plot_rbf, names=cols_name_rbf, valueToHighlight=['High'], linkVar='CELLULARITY')
We can see that overall there are two macroclusters (plus one small other), one of which tends to stratify further (cluster 0.0) while the other seems to be more consistent. We take K=7 as reference
# Reference classical clustering: rbf kernel, K=7, bandwidth 1
rbf_clusters_s='Cluster_rbf_7_1'
Lets move on and take the best 5 quantum clusters
# Best 5 quantum (non-rbf) clusterings with silhouette > 0.3 (notebook display)
df_sim_1000[(df_sim_1000['silhouette']>0.3) & (df_sim_1000.ftmap!='rbf')].iloc[:5]
| Unnamed: 0 | ftmap | K | Bandwidth | s | geom_distance | concentration | silhouette | Score_cluster | CHI | DI | v_intra | v_inter | N_samples | Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 135 | 135 | ZZ_linear | 2 | 0.125 | 127734.680535 | 9.484586 | 0.070194 | 0.628426 | 0.624934 | 154.232887 | 0.837206 | 0.056991 | 0.000647 | 1000 | Simulated |
| 136 | 136 | ZZ_linear | 3 | 0.125 | 127734.680535 | 9.484586 | 0.070194 | 0.551887 | 0.584336 | 408.213514 | 0.037421 | 0.025789 | 0.055347 | 1000 | Simulated |
| 152 | 152 | ZZ_linear | 10 | 0.250 | 74836.101545 | 5.339809 | 0.080288 | 0.391744 | 0.602663 | 321.743957 | 0.032121 | 0.026329 | 0.051701 | 1000 | Simulated |
| 151 | 151 | ZZ_linear | 9 | 0.250 | 74836.101545 | 5.339809 | 0.080288 | 0.388783 | 0.573952 | 360.224697 | 0.029118 | 0.031645 | 0.050712 | 1000 | Simulated |
| 150 | 150 | ZZ_linear | 8 | 0.250 | 74836.101545 | 5.339809 | 0.080288 | 0.375992 | 0.558809 | 229.956558 | 0.029195 | 0.035124 | 0.051325 | 1000 | Simulated |
Since all the best clusters have been done with the ZZ linear we omit these in further plots to improve readability
cols_to_plot_q = []
cols_name_q = []
# Best five quantum (non-rbf) clusterings with silhouette > 0.3
best_quantum = df_sim_1000[(df_sim_1000['silhouette'] > 0.3) & (df_sim_1000.ftmap != 'rbf')].iloc[:5]
for row in best_quantum.itertuples():
    cols_name_q.append(f'b:{row.Bandwidth},S:{row.silhouette:.3f}')
    cols_to_plot_q.append(f'Cluster_{row.ftmap}_{row.K}_{row.Bandwidth:g}')
fig = plotSankey(quantumDf_s, cols=cols_to_plot_q, names=cols_name_q, valueToHighlight=['High'], linkVar='CELLULARITY')
As for the other cases, we see once again how the stratification with lower K represents a macrostructure in the data and tends to have a higher value of S due to the bias of the silhouette score. There seems to be good consistency also between the finer stratifications (K=10, 9, 8), and considering they came from the same kernel (ftmap=ZZ_linear, b=0.25) we decide to keep K=10 as a reference since it yields the best silhouette.
# Reference quantum clustering by silhouette: ZZ_linear feature map, K=10, bandwidth 0.25
q_cluster_s='Cluster_ZZ_linear_10_0.25'
Let's look at the best ones according to the DI
# Re-rank the 1000-sample simulated results by Dunn index (in place)
df_sim_1000.sort_values(by='DI',ascending=False,inplace=True)
# Check the 10 best (notebook display)
df_sim_1000.iloc[:10]
| Unnamed: 0 | ftmap | K | Bandwidth | s | geom_distance | concentration | silhouette | Score_cluster | CHI | DI | v_intra | v_inter | N_samples | Type | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 135 | 135 | ZZ_linear | 2 | 0.125 | 127734.680535 | 9.484586 | 0.070194 | 0.628426 | 0.624934 | 154.232887 | 0.837206 | 0.056991 | 0.000647 | 1000 | Simulated |
| 94 | 94 | Z | 6 | 0.125 | 212056.597796 | 16.760649 | 0.001171 | 0.244036 | 0.475554 | 450.758446 | 0.142593 | 0.001548 | 0.015123 | 1000 | Simulated |
| 95 | 95 | Z | 7 | 0.125 | 212056.597796 | 16.760649 | 0.001171 | 0.166334 | 0.435929 | 306.404099 | 0.122423 | 0.002044 | 0.015126 | 1000 | Simulated |
| 96 | 96 | Z | 8 | 0.125 | 212056.597796 | 16.760649 | 0.001171 | 0.114115 | 0.432576 | 262.378658 | 0.109097 | 0.002068 | 0.014980 | 1000 | Simulated |
| 89 | 89 | ZZ_full | 10 | 2.000 | 90010.792065 | 25.414386 | 0.008016 | 0.048231 | 0.092967 | 38.572960 | 0.102802 | 0.020632 | 0.005223 | 1000 | Simulated |
| 93 | 93 | Z | 5 | 0.125 | 212056.597796 | 16.760649 | 0.001171 | 0.261369 | 0.452608 | 482.960746 | 0.098428 | 0.002301 | 0.015573 | 1000 | Simulated |
| 82 | 82 | ZZ_full | 3 | 2.000 | 90010.792065 | 25.414386 | 0.008016 | 0.030379 | 0.039667 | 116.652572 | 0.092874 | 0.011679 | 0.004883 | 1000 | Simulated |
| 105 | 105 | Z | 8 | 0.250 | 154580.798965 | 8.371354 | 0.012575 | 0.317547 | 0.614840 | 437.464558 | 0.090453 | 0.001649 | 0.020288 | 1000 | Simulated |
| 107 | 107 | Z | 10 | 0.250 | 154580.798965 | 8.371354 | 0.012575 | 0.263067 | 0.611583 | 315.275127 | 0.088751 | 0.001785 | 0.020384 | 1000 | Simulated |
| 106 | 106 | Z | 9 | 0.250 | 154580.798965 | 8.371354 | 0.012575 | 0.286053 | 0.602580 | 345.848264 | 0.079558 | 0.002046 | 0.020407 | 1000 | Simulated |
Select the first 5 for plotting
cols_name_q_DI = []
cols_to_plot_q_DI = []
# First five clusterings ranked by DI
for i in df_sim_1000.iloc[:5].itertuples():
    name = 'Cluster_'+i.ftmap+'_'+str(i.K)+'_'+f'{i.Bandwidth:g}'
    # BUGFIX: the format string has two placeholders; the stray third argument
    # (i.silhouette) was never rendered, so drop it.
    name_col = 'b:{},\nDI:{:.3f}'.format(i.Bandwidth, i.DI)
    cols_name_q_DI.append(name_col)
    cols_to_plot_q_DI.append(name)
print(cols_to_plot_q_DI)
['Cluster_ZZ_linear_2_0.125', 'Cluster_Z_6_0.125', 'Cluster_Z_7_0.125', 'Cluster_Z_8_0.125', 'Cluster_ZZ_full_10_2']
# Sankey of the five best clusterings by DI (1000-sample simulated case)
fig=plotSankey(quantumDf_s, cols=cols_to_plot_q_DI, names=cols_name_q_DI, valueToHighlight=['High'], linkVar='CELLULARITY')
As before, we can see that the case for K=2 is capturing the macrostructure of the data, which consists of a number of subclusters; this phenomenon is due to the DI being affected by the same bias as the silhouette score. We can also see a good level of consistency between the 2nd, 3rd and 4th clusterings reported in the plot, whereas the 5th is completely different; however, it is also the one with the lowest DI and can be discarded from further analysis. Following the same approach used for the silhouette score, we keep the 2nd clustering since it has the best DI, considering the remaining fine stratifications come from the same kernel (ftmap: Z, b: 0.125)
# Reference quantum clustering by DI: Z feature map, K=6, bandwidth 0.125
q_cluster_DI='Cluster_Z_6_0.125'
Now we can put together the quantum, rbf and known stratifications (INTCLUST and CLAUDIN) and look at how they compare:
# Recap of the two selected quantum references
print(q_cluster_s,q_cluster_DI)
Cluster_ZZ_linear_10_0.25 Cluster_Z_6_0.125
# Compare the two quantum references against the rbf reference (Sankey + AMI)
fig=plotSankey(quantumDf_s, cols=[q_cluster_DI,rbf_clusters_s,q_cluster_s], names=['Z_0.125 best_DI','rbf','ZZ_linear_0.25 best_s'], valueToHighlight=['High'], linkVar='CELLULARITY')
print('AMI between rbf and ZZ_linear_0.25',adjusted_mutual_info_score(quantumDf_s[rbf_clusters_s],quantumDf_s[q_cluster_s]))
print('AMI between rbf and Z_0.125',adjusted_mutual_info_score(quantumDf_s[rbf_clusters_s],quantumDf_s[q_cluster_DI]))
print('AMI between ZZ_linear_0.25 and Z_0.125',adjusted_mutual_info_score(quantumDf_s[q_cluster_s],quantumDf_s[q_cluster_DI]))
AMI between rbf and ZZ_linear_0.25 0.619543600497348 AMI between rbf and Z_0.125 0.5063765832802456 AMI between ZZ_linear_0.25 and Z_0.125 0.4641176521182337
some consistency is present
Next:
# Quantum references vs the CLAUDIN_SUBTYPE stratification (Sankey + AMI)
fig=plotSankey(quantumDf_s, cols=[q_cluster_s,'CLAUDIN_SUBTYPE',q_cluster_DI], names=['ZZ_linear_0.25 (best_s)','CLAUDIN_SUBTYPE','Z_0.125 (best_DI)'], valueToHighlight=['High'], linkVar='CELLULARITY')
print('AMI between quantum stratifications:',adjusted_mutual_info_score(quantumDf_s[q_cluster_s],quantumDf_s[q_cluster_DI]))
print('AMI between Claudin and ZZ_linear_0.25:',adjusted_mutual_info_score(quantumDf_s[q_cluster_s],quantumDf_s['CLAUDIN_SUBTYPE']))
print('AMI between Claudin and Z_0.125:',adjusted_mutual_info_score(quantumDf_s[q_cluster_DI],quantumDf_s['CLAUDIN_SUBTYPE']))
AMI between quantum stratifications: 0.4641176521182337 AMI between Claudin and ZZ_linear_0.25: 0.2204539149008424 AMI between Claudin and Z_0.125: 0.1956390672955398
Some consistency but not too much we can say
Next:
# Quantum references vs the INTCLUST stratification (Sankey + AMI)
fig=plotSankey(quantumDf_s, cols=[q_cluster_s,'INTCLUST',q_cluster_DI], names=['ZZ_linear_0.25 best_s','INTCLUST','Z_0.125 best_DI'], valueToHighlight=['High'], linkVar='CELLULARITY')
print('AMI between INTCLUST and ZZ_linear_0.25:', adjusted_mutual_info_score(quantumDf_s[q_cluster_s],quantumDf_s['INTCLUST']))
print('AMI between INTCLUST and Z_0.125:', adjusted_mutual_info_score(quantumDf_s[q_cluster_DI],quantumDf_s['INTCLUST']))
AMI between INTCLUST and ZZ_linear_0.25: 0.22356490851066763 AMI between INTCLUST and Z_0.125: 0.19199088944027898
Very different again
# All four stratifications side by side
fig=plotSankey(quantumDf_s, cols=[q_cluster_s,q_cluster_DI]+['INTCLUST','CLAUDIN_SUBTYPE'] , valueToHighlight=['High'], linkVar='CELLULARITY')
Well, they are not even coherent between themselves :(
# Baseline agreement between the two known stratifications
print('AMI between CLAUDIN and INTCLUST:', adjusted_mutual_info_score(quantumDf_s['CLAUDIN_SUBTYPE'],quantumDf_s['INTCLUST']))
AMI between CLAUDIN and INTCLUST: 0.24130552243880443
Let's now check how the selected clusters behave in terms of survival:
# Distribution of overall survival times (months) in the merged 1000-sample frame
quantumDf['OS_MONTHS'].describe()
count 1000.000000 mean 128.472167 std 75.786077 min 0.000000 25% 64.541667 50% 122.733333 75% 186.558333 max 355.200000 Name: OS_MONTHS, dtype: float64
# Recap of the three selected clusterings.
# FIX: typo "according tu" -> "according to" in the printed messages.
print('Best rbf: ', rbf_clusters_s )
print('Best quantum according to Silhouette: ', q_cluster_s)
print('Best quantum according to DI: ', q_cluster_DI)
Best rbf: Cluster_rbf_7_1 Best quantum according tu Silhouette: Cluster_ZZ_linear_10_0.25 Best quantum according tu DI: Cluster_Z_6_0.125
# Kaplan-Meier survival curves for the best classical (rbf) clustering,
# restricted to OS_MONTHS below ~186 (the 75th percentile of survival time).
fig, ax = plt.subplots(figsize=(8, 8))
km_data = quantumDf_real_100.loc[quantumDf_real_100['OS_MONTHS'] < 186]
plot_KM_curves(km_data, durationVar='OS_MONTHS', eventVar='OS_STATUS',
               indVar=rbf_clusters_s, groups=list(range(7)), ax=ax, ci_show=False)
ax.set_xlabel('Survival, Months')
ax.set_ylabel('Overall survival probability')
Text(0, 0.5, 'Overall survival probability')
# Kaplan-Meier survival curves for the silhouette-best quantum clustering,
# same OS_MONTHS < 186 truncation as the classical case.
fig, ax = plt.subplots(figsize=(8, 8))
km_data = quantumDf_real_100.loc[quantumDf_real_100['OS_MONTHS'] < 186]
plot_KM_curves(km_data, durationVar='OS_MONTHS', eventVar='OS_STATUS',
               indVar=q_cluster_s, groups=list(range(10)), ax=ax, ci_show=False)
ax.set_xlabel('Survival, Months')
ax.set_ylabel('Overall survival probability')
Text(0, 0.5, 'Overall survival probability')
# Kaplan-Meier survival curves for the DI-best quantum clustering,
# same OS_MONTHS < 186 truncation as the other cases.
fig, ax = plt.subplots(figsize=(8, 8))
km_data = quantumDf_real_100.loc[quantumDf_real_100['OS_MONTHS'] < 186]
plot_KM_curves(km_data, durationVar='OS_MONTHS', eventVar='OS_STATUS',
               indVar=q_cluster_DI, groups=list(range(6)), ax=ax, ci_show=False)
ax.set_xlabel('Survival, Months')
ax.set_ylabel('Overall survival probability')
Text(0, 0.5, 'Overall survival probability')
# Check how many samples the 100-sample and 1000-sample runs share.
shared_samples = set(df_clust_sim.index) & set(df_clust.index)
print('Intersection between 100 and 1000 samples:', len(shared_samples))
Intersection between 100 and 1000 samples: 100
Good grief! Is this a good thing?
Note: this could have been expected if we think about how the sampling is done. Since the random seed is fixed, pandas draws N samples in a random order determined by that seed; therefore selecting n < N samples simply takes the first n of that same ordering.
Here maybe is a clearer example:
# Demonstrate that pandas sampling with a fixed random_state is nested:
# the smaller draw is a prefix of the larger one under the same seed.
test_1 = data_input.sample(n=1000, axis=0, random_state=42)
test_2 = data_input.sample(n=10, axis=0, random_state=42)
shared = set(test_1.index) & set(test_2.index)
print('Intersection between test sampling:', len(shared))
Intersection between test sampling: 10
Okay! Back to our analysis.
We can check:
-AMI and ARI for every case(?)
-Sankey for the best qclusters
# Merge the 1000-sample and 100-sample clustering assignments on the shared
# samples (inner join), suffixing columns by sampling size.
df_sampling = df_clust.join(df_clust_sim, how='inner', lsuffix='_1000', rsuffix='_100')

# Calculate ARI, NMI and AMI between the same clustering run at both sampling sizes.
# BUGFIX: the original used `break`, which aborts the whole scan at the first
# column without '_1000'; if any unsuffixed (non-overlapping) column appears
# before the suffixed ones, every remaining clustering is silently skipped.
# `continue` makes the loop independent of column order.
ami_ari_dict_list = []
for col in df_sampling.columns:
    if '_1000' not in col:
        continue
    clust = col.split('_')
    # Feature-map identifier: 'Z', 'ZZ_<entanglement>' (e.g. 'ZZ_linear'), or classical 'rbf'.
    if clust[1] == 'Z':
        ftmap = clust[1]
    elif clust[1] == 'ZZ':
        ftmap = clust[1] + '_' + clust[2]
    else:
        ftmap = 'rbf'
    base = '_'.join(clust[:-1])  # column name without the '_1000' suffix
    labels_1000 = df_sampling[base + '_1000']
    labels_100 = df_sampling[base + '_100']
    ami_ari_dict_list.append({
        'ftmap': ftmap,
        'K': clust[-3],          # number of clusters (second-to-last token before suffix)
        'Bandwidth': clust[-2],  # kernel bandwidth
        'ARI': adjusted_rand_score(labels_1000, labels_100),
        'NMI': normalized_mutual_info_score(labels_1000, labels_100),
        'AMI': adjusted_mutual_info_score(labels_1000, labels_100),
    })

# List of dictionaries to dataframe
df_nmi_ari_sample = pd.DataFrame(ami_ari_dict_list)
df_nmi_ari_sample
| ftmap | K | Bandwidth | ARI | NMI | AMI | |
|---|---|---|---|---|---|---|
| 0 | rbf | 2 | 0.125 | -0.036213 | 0.025203 | 0.007628 |
| 1 | rbf | 3 | 0.125 | 0.960132 | 0.930571 | 0.928469 |
| 2 | rbf | 4 | 0.125 | 0.749738 | 0.814390 | 0.805991 |
| 3 | rbf | 5 | 0.125 | 0.476494 | 0.637160 | 0.613408 |
| 4 | rbf | 6 | 0.125 | 0.603511 | 0.739067 | 0.715228 |
| ... | ... | ... | ... | ... | ... | ... |
| 175 | ZZ_linear | 6 | 2 | 0.149792 | 0.288728 | 0.222347 |
| 176 | ZZ_linear | 7 | 2 | 0.133620 | 0.332845 | 0.247883 |
| 177 | ZZ_linear | 8 | 2 | 0.150828 | 0.385798 | 0.281938 |
| 178 | ZZ_linear | 9 | 2 | 0.179443 | 0.433988 | 0.310469 |
| 179 | ZZ_linear | 10 | 2 | 0.144174 | 0.388300 | 0.224831 |
180 rows × 6 columns
# Plot stability (AMI between the 100- and 1000-sample runs) against K,
# one line per feature map.
sns.set_style("whitegrid")
sns.set_context("paper", font_scale=1.5)
sns.lineplot(
    data=df_nmi_ari_sample,
    x='K',
    y='AMI',
    hue='ftmap',
    markers=True,
    palette='tab10',
)
plt.ylim(0.00, 1.05)
plt.title('AMI score between 100 and 1000 samples')
Text(0.5, 1.0, 'AMI score between 100 and 1000 samples')
# Rank the clusterings by stability (highest AMI between sampling sizes first).
df_nmi_ari_sample.sort_values(by='AMI',ascending=False)
| ftmap | K | Bandwidth | ARI | NMI | AMI | |
|---|---|---|---|---|---|---|
| 135 | ZZ_linear | 2 | 0.125 | 1.000000 | 1.000000 | 1.000000 |
| 136 | ZZ_linear | 3 | 0.125 | 1.000000 | 1.000000 | 1.000000 |
| 145 | ZZ_linear | 3 | 0.25 | 0.963616 | 0.947567 | 0.946266 |
| 151 | ZZ_linear | 9 | 0.25 | 0.934655 | 0.942414 | 0.930837 |
| 1 | rbf | 3 | 0.125 | 0.960132 | 0.930571 | 0.928469 |
| ... | ... | ... | ... | ... | ... | ... |
| 36 | rbf | 2 | 2 | -0.036213 | 0.025203 | 0.007628 |
| 27 | rbf | 2 | 1 | -0.036213 | 0.025203 | 0.007628 |
| 18 | rbf | 2 | 0.5 | -0.036213 | 0.025203 | 0.007628 |
| 9 | rbf | 2 | 0.25 | -0.036213 | 0.025203 | 0.007628 |
| 0 | rbf | 2 | 0.125 | -0.036213 | 0.025203 | 0.007628 |
180 rows × 6 columns
Sankey plot
### Sankey plot
# Work on a copy so df_sampling keeps its raw cluster labels.
quantumDf_sample = df_sampling.copy()

# Collect the clustering columns (idiom: comprehension instead of a manual
# append loop). NOTE(review): this list is not used in the visible cells —
# presumably consumed further down the notebook; verify before removing.
cluster_col_sample = [col for col in quantumDf_sample.columns if 'Cluster' in col]

# Prefix every value with its column name so node labels are unique across
# columns in the Sankey diagram (plotSankey keys nodes by value).
for column in quantumDf_sample.columns:
    quantumDf_sample[column] = quantumDf_sample[column].apply(lambda x: f"{column}_{x}")
#plot for quantum clustering (best silhouette)
# Sankey + AMI for the silhouette-best quantum clustering at both sampling sizes.
pair_s = [q_cluster_s + '_1000', q_cluster_s + '_100']
fig = plotSankey(quantumDf_sample, cols=pair_s,
                 names=['ZZ_linear_0.25 1000 samples', 'ZZ_linear_0.25 100 samples'])
print('AMI between ZZ_linear_0.25 1000 and ZZ_linear_0.25 100:',
      adjusted_mutual_info_score(quantumDf_sample[pair_s[0]], quantumDf_sample[pair_s[1]]))
AMI between ZZ_linear_0.25 1000 and ZZ_linear_0.25 100: 0.888533893993951
# Sankey + AMI for the DI-best quantum clustering at both sampling sizes.
pair_di = [q_cluster_DI + '_1000', q_cluster_DI + '_100']
fig = plotSankey(quantumDf_sample, cols=pair_di,
                 names=['Z_0.125 1000 samples', 'Z_0.125 100 samples'])
print('AMI between Z_0.125 1000 and Z_0.125 100:',
      adjusted_mutual_info_score(quantumDf_sample[pair_di[0]], quantumDf_sample[pair_di[1]]))
AMI between Z_0.125 1000 and Z_0.125 100: 0.45696903393889365
# Sankey + AMI for the classical (rbf) clustering at both sampling sizes.
pair_rbf = [rbf_clusters_s + '_1000', rbf_clusters_s + '_100']
fig = plotSankey(quantumDf_sample, cols=pair_rbf,
                 names=['rbf 1000 samples', 'rbf 100 samples'])
print('AMI between rbf 1000 and rbf 100:',
      adjusted_mutual_info_score(quantumDf_sample[pair_rbf[0]], quantumDf_sample[pair_rbf[1]]))
AMI between rbf 1000 and rbf 100: 0.6189432984522432
So for now we can say that the quantum clustering for ZZ_linear (best silhouette) is more stable than both the classical one and the Z map (best DI). We have to think about other specific metrics to evaluate the possible advantage in terms of sample complexity.